Time Series Analysis: Vocational School Graduates’ Unemployment

Author

Emily Mitchum

Imports

Code
library(astsa)
library(stats)
library(ggplot2)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
Code
library(ggplot2)
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
Code
library(patchwork)
library(TSstudio)
library(zoo) 

Attaching package: 'zoo'
The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
Code
library(lubridate)

Attaching package: 'lubridate'
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
Code
library(dplyr)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0     ✔ stringr 1.5.1
✔ purrr   1.0.2     ✔ tibble  3.2.1
✔ readr   2.1.5     ✔ tidyr   1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ plotly::filter() masks dplyr::filter(), stats::filter()
✖ dplyr::lag()     masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Read in Data

Code
unemp_df <- read.csv('/Users/Emi/Library/CloudStorage/GoogleDrive-emilymitchum265@gmail.com/Shared drives/DSAN/DSAN 5100/Final Project/vocational_school_employment_outcomes/Cleaned data/Cleaned_monthly_unemployment.csv')
head(unemp_df)
        date    education_level unemployment_rate
1 2019-01-01 some_college_assoc               3.5
2 2019-02-01 some_college_assoc               3.2
3 2019-03-01 some_college_assoc               3.4
4 2019-04-01 some_college_assoc               3.1
5 2019-05-01 some_college_assoc               2.7
6 2019-06-01 some_college_assoc               3.0

Create TS Object

Code
unemp_df$date <- as.Date(unemp_df$date, "%Y-%m-%d")
unemp_df$unemployment_rate <- as.numeric(unemp_df$unemployment_rate)
max <- max(unemp_df$unemployment_rate)
min <- min(unemp_df$unemployment_rate)


fig <- plot_ly(
  unemp_df,
  x = ~date,
  y = ~unemp_df$unemployment_rate,
  color = ~education_level,        
  colors = c(
    "ba_only" = "#E64B35",
    "some_college_assoc" = "#4DBBD5",
    "high_school" = "#00A087"
  ),
  type = 'scatter',
  mode = 'lines'
)

fig <- fig |>
  layout(
    title = "Unemployment Rate (2019–2025)",
    xaxis = list(title = "Date"),
    yaxis = list(title = "Employment-Population Ratio"),
    legend = list(title = list(text = "<b>Education Level</b>"))
      # ,
    # shapes = list(
    #   list(type = 'rect',
    #        x0 = "2019-12-01", x1 = "2020-12-01",
    #        y0 = min-2, y1 = max,
    #        line = list(color = 'rgba(0, 0, 255, 0.5)', width = 2),
    #        fillcolor = 'rgba(0, 0, 255, 0.2)')
    # )
    # annotations = list(
    #   list(
    #     x = "2020-06-01",
    #     y = max + 2,
    #     text = "COVID-19",
    #     showarrow = FALSE,
    #     font = list(color = 'black', size = 12)
    #   )
  # )
)


fig

Convert to Time-Series Object

Code
vocational_unemp <- unemp_df[unemp_df$education_level == "some_college_assoc",]
###### Convert to Time Series Object ######
ts_unemp <- ts(vocational_unemp$unemployment_rate, start = c(2019, 1), frequency = 12)

Analyze Lag Plots

Code
ts_lags(ts_unemp)

Overall, the lag plots do not show strong linear relationships across the higher order lags. Lag 1 shows the strongest upward pattern, however, it is not clearly linear. This suggests that this month’s unemployment rate for vocational grads is moderately correlated with last month’s unemployment rate. Lags 2 and 3 also show positive sloping patterns. However, after Lag 4, the plots begin to appear random.

This pattern indicates mild autocorrelation at short lags. After lags 3-4, there is no long-term autocorrelation or seasonality.

ACF and PACFs

Code
library(forecast)
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 

Attaching package: 'forecast'
The following object is masked from 'package:astsa':

    gas
Code
ggAcf(ts_unemp, frequency=12)
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `frequency`

Code
ggPacf(ts_unemp, frequency=12)
Warning in ggplot2::geom_segment(lineend = "butt", ...): Ignoring unknown
parameters: `frequency`

The ACF plot shows strong auto-correlation at initial lags, indicating that last month’s unemployment rate is very predictive of this month’s unemployment rate.

The ACF plot also shows that vocational graduates’ unemployment rates are non-stationary, with values decaying slowly over time, rather than rapidly. This suggests that observations far in the past influence the current value. The PACF also indicates that current values depend heavily on the previous values, with a very large spike at lag 1.

Code
tseries::adf.test(ts_unemp)

    Augmented Dickey-Fuller Test

data:  ts_unemp
Dickey-Fuller = -2.7152, Lag order = 4, p-value = 0.2829
alternative hypothesis: stationary

The P-value obtained from the ADF test is greater than 0.05. Therefore, we do not have enough evidence to reject the null hypothesis at 5% significance level. This indicates the data is non-stationary, which aligns with the ADF and PACF plots we obtained.

Code
dc = decompose(ts_unemp)

df <- tibble(
  date = as.Date(time(ts_unemp)),
  data = dc$x,
  trend = dc$trend,
  seasonal = dc$seasonal,
  remainder = dc$random
)

df %>%
  pivot_longer(-date) %>%
  ggplot(aes(date, value)) +
  geom_line() +
  facet_wrap(~name, scales = "free_y", ncol = 1) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  theme_minimal()
Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_line()`).

Seasonal: The seasonal component represents regular, repeating patterns throughout the year. While the plot appears to show a consistent yearly rise, followed by a dip, this pattern doesn’t seem to represent true seasonality. The lag plots and ACF and PACF plots we generated earlier do not show seasonality. The seasonal pattern shown in the classical decomposition likely is visualizing the assumption of a yearly seasonal structure rather than true seasonality in the unemployment rate.

Trend: The trend line represents the underlying pattern of unemployment over a longer period, excluding short-term fluctuations. The trend shows a rapid increase from 2019 to 2020, followed by the start of a steady decline in late 2020 that has continued through 2025. This very clearly shows the spike in unemployment as a result of COVID-19, and the economy’s slow recovery over time.

Remainder: The remainder component captures the random, unpredictable variations in the data not explained by the trend or seasonality. The remainder clearly shows the steep spike in vocational graduates’ unemployment rate at the start of 2020 due to the COVID-19 pandemic. After 2020, the remainder values decreases and stabilizes, showing that the shocks caused by COVID-19 eventually diminished once the economy began to transition into recovery.

Decomposition

Code
diff_1 <- diff(ts_unemp)

diff_2 <- diff(ts_unemp, differences = 2)

acf_plot_1 <- ggAcf(diff_1,50) +
  ggtitle("ACF of First-Order Differenced Series") +
  theme_minimal()

acf_plot_2 <- ggAcf(diff_2,50) +
  ggtitle("ACF of Second-Order Differenced Series") +
  theme_minimal()

acf_plot_1/acf_plot_2

The ACF and PACF plots of the first and second differenced series both suggest that the data only needs one round of differencing. In the first-order differenced ACF, the data appears near-stationary, with most auto-correlations within the confidence bands and fluctuating around a constant mean. The second-order differenced ACF has a very large negative lag-1 auto-correlation, which is a sign that the data is over-differenced.

Code
p1<-ggPacf(diff_1,50) +
  ggtitle("PACF of First Differenced Series") +
  theme_minimal()

p2<-ggPacf(diff_2,50) +
  ggtitle("PACF of Second Differenced Series") +
  theme_minimal()

p1/p2

The first order differenced PACF tells a similar story, with a most partial correlations falling within the confidence bounds and no signs of trend or seasonality. The second order differenced PACF again shows a very large negative value for lag 1, and a pattern of negative partial correlations. This likely means the second-order difference has introduced more complexity than necessary for this data.

Code
library(knitr)
library(kableExtra)

Attaching package: 'kableExtra'
The following object is masked from 'package:dplyr':

    group_rows
Code
p_range <- 0:2
d_range <- 1
q_range <- 0:2

n_combinations <- length(p_range) * length(d_range) * length(q_range)

results_matrix <- matrix(NA, nrow = n_combinations, ncol = 6)

i <- 1

for (q in q_range) {
  for (p in p_range) {
    d <- d_range

    model <- Arima(ts_unemp, order = c(p, d, q), include.drift = TRUE)

    results_matrix[i, ] <- c(p, d, q, model$aic, model$bic, model$aicc)

    i <- i + 1
  }
}

results_df <- as.data.frame(results_matrix)
colnames(results_df) <- c("p", "d", "q", "AIC", "BIC", "AICc")

highlight_aic_row <- which.min(results_df$AIC)
highlight_bic_row <- which.min(results_df$BIC)

knitr::kable(results_df, align = 'c', caption = "Comparison of ARIMA Models") %>%
  kable_styling(full_width = FALSE, position = "center") %>%
  row_spec(c(highlight_aic_row,highlight_bic_row), bold = TRUE, background = "#FFFF99") %>%
  row_spec(highlight_bic_row, bold = TRUE, background = "#90EE90")
Comparison of ARIMA Models
p d q AIC BIC AICc
0 1 0 287.0250 291.7891 287.1809
1 1 0 288.9916 296.1377 289.3074
2 1 0 289.3368 298.8649 289.8701
0 1 1 288.9780 296.1241 289.2938
1 1 1 284.5947 294.1228 285.1280
2 1 1 286.0034 297.9136 286.8143
0 1 2 288.4001 297.9282 288.9334
1 1 2 285.9334 297.8436 286.7443
2 1 2 286.5578 300.8499 287.7084
Code
auto.arima(ts_unemp)
Series: ts_unemp 
ARIMA(0,1,0) 

sigma^2 = 2.014:  log likelihood = -141.51
AIC=285.03   AICc=285.08   BIC=287.41
Code
set.seed(2025)

first_model_output <- capture.output(sarima(ts_unemp, 0, 1, 0))